In this notebook we will examine the human resources data that was prepared in the Data Wrangling notebook. A detailed exploration of the data will provide insights into the employee body of company XYZ and help us understand which features are useful for class separation by Machine Learning models.
library(ggplot2) # Visulazition tools
library(corrplot) # Correlation plot
## corrplot 0.84 loaded
library(repr) # Set plot dimensions
library(cowplot) # Plot multiple plots together
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
library(gridExtra)
library(GGally) # Adds pair-wise scatter plots to ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.0.3 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ──────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::combine() masks gridExtra::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr) #
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
options(repr.plot.width=4, repr.plot.height=4) # Set the initial plot area dimensions
dt <- read.csv("FinalData.csv")
# Remove the first column created by exporting/loading the csv file.
dt %<>% select(-1)
# Convert ordered categorical columns to factors.
dt$Education <- ordered(dt$Education,
levels = c("Below College", "College", "Bachelor", "Master", "Doctor"))
dt$BusinessTravel <- ordered(dt$BusinessTravel,
levels = c("Non-Travel", "Travel-Rarely", "Travel-Frequently"))
dt$JobLevel <- ordered(dt$JobLevel,
levels = c(1, 2, 3, 4, 5),
labels = c("1", "2", "3", "4", "5"))
dt$StockOptionLevel <- ordered(dt$StockOptionLevel,
levels = c(0, 1, 2, 3),
labels = c("0", "1", "2", "3"))
dt$EnvironmentSatisfaction <- ordered(dt$EnvironmentSatisfaction,
levels = c("N/A","Low", "Medium", "High", "Very High"))
dt$JobInvolvement <- ordered(dt$JobInvolvement,
levels = c("Low", "Medium", "High", "Very High"))
dt$JobSatisfaction <- ordered(dt$JobSatisfaction,
levels = c("N/A","Low", "Medium", "High", "Very High"))
dt$PerformanceRating <- ordered(dt$PerformanceRating,
levels = c("Low", "Good", "Excellent", "Outstanding"))
dt$WorkLifeBalance <- ordered(dt$WorkLifeBalance,
levels = c("N/A","Bad", "Good", "Better", "Best"))
dt$Attrition <- ordered(dt$Attrition,
levels = c("Stayed", "Left"))
# Convert other categorical variables to the correct type.
catcols <- c("Department", "EducationField", "Gender", "JobRole", "MaritalStatus")
dt %<>% mutate_at(catcols, factor)
A quick view to make sure the data was loaded correctly.
dim(dt)
## [1] 4410 26
# There are 4410 instances, or employees documented in the dataset, with 26 variables.
head(dt)
## Age Attrition BusinessTravel Department DistanceFromHome
## 1 51 Stayed Travel-Rarely Sales 6
## 2 31 Left Travel-Frequently Research & Development 10
## 3 32 Stayed Travel-Frequently Research & Development 17
## 4 38 Stayed Non-Travel Research & Development 2
## 5 32 Stayed Travel-Rarely Research & Development 10
## 6 46 Stayed Travel-Rarely Research & Development 8
## Education EducationField Gender JobLevel JobRole
## 1 College Life Sciences Female 1 Healthcare Representative
## 2 Below College Life Sciences Female 1 Research Scientist
## 3 Master Other Male 4 Sales Executive
## 4 Doctor Life Sciences Male 3 Human Resources
## 5 Below College Medical Male 1 Sales Executive
## 6 Bachelor Life Sciences Female 4 Research Director
## MaritalStatus MonthlyIncome NumCompaniesWorked PercentSalaryHike
## 1 Married 131160 1 11
## 2 Single 41890 0 23
## 3 Married 193280 1 15
## 4 Married 83210 3 11
## 5 Single 23420 4 12
## 6 Married 40710 3 13
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear YearsAtCompany
## 1 0 1 6 1
## 2 1 6 3 5
## 3 3 5 2 5
## 4 3 13 5 8
## 5 2 9 2 6
## 6 0 28 5 7
## YearsSinceLastPromotion YearsWithCurrManager EnvironmentSatisfaction
## 1 0 0 High
## 2 1 4 High
## 3 0 3 Medium
## 4 7 5 Very High
## 5 0 4 Very High
## 6 7 7 High
## JobSatisfaction WorkLifeBalance JobInvolvement PerformanceRating AvgHrs
## 1 Very High Good High Excellent 7.37
## 2 Medium Best Medium Outstanding 7.72
## 3 Medium Bad High Excellent 7.01
## 4 Very High Better Medium Excellent 7.19
## 5 Low Better High Excellent 8.01
## 6 Medium Good High Excellent 10.80
str(dt)
## 'data.frame': 4410 obs. of 26 variables:
## $ Age : int 51 31 32 38 32 46 28 29 31 25 ...
## $ Attrition : Ord.factor w/ 2 levels "Stayed"<"Left": 1 2 1 1 1 1 2 1 1 1 ...
## $ BusinessTravel : Ord.factor w/ 3 levels "Non-Travel"<"Travel-Rarely"<..: 2 3 3 1 2 2 2 2 2 1 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
## $ DistanceFromHome : int 6 10 17 2 10 8 11 18 1 7 ...
## $ Education : Ord.factor w/ 5 levels "Below College"<..: 2 1 4 5 1 3 2 3 3 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 1 2 2 2 1 ...
## $ JobLevel : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 1 1 4 3 1 4 2 2 3 4 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 1 7 8 2 8 6 8 8 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 2 3 2 2 3 2 3 2 2 1 ...
## $ MonthlyIncome : int 131160 41890 193280 83210 23420 40710 58130 31430 20440 134640 ...
## $ NumCompaniesWorked : int 1 0 1 3 4 3 2 2 0 1 ...
## $ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
## $ StockOptionLevel : Ord.factor w/ 4 levels "0"<"1"<"2"<"3": 1 2 4 4 3 1 2 4 1 2 ...
## $ TotalWorkingYears : int 1 6 5 13 9 28 5 10 10 6 ...
## $ TrainingTimesLastYear : int 6 3 2 5 2 5 2 2 2 2 ...
## $ YearsAtCompany : int 1 5 5 8 6 7 0 0 9 6 ...
## $ YearsSinceLastPromotion: int 0 1 0 7 0 7 0 0 7 1 ...
## $ YearsWithCurrManager : int 0 4 3 5 4 7 0 0 8 5 ...
## $ EnvironmentSatisfaction: Ord.factor w/ 5 levels "N/A"<"Low"<"Medium"<..: 4 4 3 5 5 4 2 2 3 3 ...
## $ JobSatisfaction : Ord.factor w/ 5 levels "N/A"<"Low"<"Medium"<..: 5 3 3 5 2 3 4 3 5 2 ...
## $ WorkLifeBalance : Ord.factor w/ 5 levels "N/A"<"Bad"<"Good"<..: 3 5 2 4 4 3 2 4 4 4 ...
## $ JobInvolvement : Ord.factor w/ 4 levels "Low"<"Medium"<..: 3 2 3 2 3 3 3 3 3 3 ...
## $ PerformanceRating : Ord.factor w/ 4 levels "Low"<"Good"<"Excellent"<..: 3 4 3 3 3 3 4 4 4 3 ...
## $ AvgHrs : num 7.37 7.72 7.01 7.19 8.01 10.8 6.92 6.73 7.24 7.08 ...
To determine what features distinguish who leaves or stays, it is also important to understand the employee demographic as a whole. We’ll start by looking at all employees.
First, here is a summary view of the numeric columns of the dataset. Summary function does not provide standard deviations and must be calculated separately.
# Identify numeric columns.
numcols <- dt %>% select_if(is.numeric) %>% colnames
summary(dt[,numcols])
## Age DistanceFromHome MonthlyIncome NumCompaniesWorked
## Min. :18.00 Min. : 1.000 Min. : 10090 Min. :0.000
## 1st Qu.:30.00 1st Qu.: 2.000 1st Qu.: 29110 1st Qu.:1.000
## Median :36.00 Median : 7.000 Median : 49190 Median :2.000
## Mean :36.92 Mean : 9.193 Mean : 65029 Mean :2.692
## 3rd Qu.:43.00 3rd Qu.:14.000 3rd Qu.: 83800 3rd Qu.:4.000
## Max. :60.00 Max. :29.000 Max. :199990 Max. :9.000
## PercentSalaryHike TotalWorkingYears TrainingTimesLastYear YearsAtCompany
## Min. :11.00 Min. : 0.00 Min. :0.000 Min. : 0.000
## 1st Qu.:12.00 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.: 3.000
## Median :14.00 Median :10.00 Median :3.000 Median : 5.000
## Mean :15.21 Mean :11.28 Mean :2.799 Mean : 7.008
## 3rd Qu.:18.00 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.: 9.000
## Max. :25.00 Max. :40.00 Max. :6.000 Max. :40.000
## YearsSinceLastPromotion YearsWithCurrManager AvgHrs
## Min. : 0.000 Min. : 0.000 Min. : 5.950
## 1st Qu.: 0.000 1st Qu.: 2.000 1st Qu.: 6.670
## Median : 1.000 Median : 3.000 Median : 7.410
## Mean : 2.188 Mean : 4.123 Mean : 7.701
## 3rd Qu.: 3.000 3rd Qu.: 7.000 3rd Qu.: 8.370
## Max. :15.000 Max. :17.000 Max. :11.030
# Calculate the standard deviation
for(col in numcols){
cat(paste(col, as.character(round(sd(dt[,col]), 2)), '\n'))
}
## Age 9.13
## DistanceFromHome 8.11
## MonthlyIncome 47068.89
## NumCompaniesWorked 2.49
## PercentSalaryHike 3.66
## TotalWorkingYears 7.77
## TrainingTimesLastYear 1.29
## YearsAtCompany 6.13
## YearsSinceLastPromotion 3.22
## YearsWithCurrManager 3.57
## AvgHrs 1.34
Insights
Next is a frequency table of the categorical features of the dataset.
summary(dt[,!colnames(dt) %in% numcols])
## Attrition BusinessTravel Department
## Stayed:3699 Non-Travel : 450 Human Resources : 189
## Left : 711 Travel-Rarely :3129 Research & Development:2883
## Travel-Frequently: 831 Sales :1338
##
##
##
##
## Education EducationField Gender JobLevel
## Below College: 510 Human Resources : 81 Female:1764 1:1629
## College : 846 Life Sciences :1818 Male :2646 2:1602
## Bachelor :1716 Marketing : 477 3: 654
## Master :1194 Medical :1392 4: 318
## Doctor : 144 Other : 246 5: 207
## Technical Degree: 396
##
## JobRole MaritalStatus StockOptionLevel
## Sales Executive :978 Divorced: 981 0:1893
## Research Scientist :876 Married :2019 1:1788
## Laboratory Technician :777 Single :1410 2: 474
## Manufacturing Director :435 3: 255
## Healthcare Representative:393
## Manager :306
## (Other) :645
## EnvironmentSatisfaction JobSatisfaction WorkLifeBalance JobInvolvement
## N/A : 25 N/A : 20 N/A : 38 Low : 249
## Low : 845 Low : 860 Bad : 239 Medium :1125
## Medium : 856 Medium : 840 Good :1019 High :2604
## High :1350 High :1323 Better:2660 Very High: 432
## Very High:1334 Very High:1367 Best : 454
##
##
## PerformanceRating
## Low : 0
## Good : 0
## Excellent :3732
## Outstanding: 678
##
##
##
# Percent of employees who left the company last year.
print(paste("Percent of employees who left the company last year: " ,
round(nrow(dt[dt$Attrition=="Left",])/nrow(dt)*100, 2), "%",
sep = ""))
## [1] "Percent of employees who left the company last year: 16.12%"
Insights
dt %>% select_if(is.numeric) %>% cor %>% corrplot(order="FPC", tl.col="black")
ggpairs(dt[,numcols])
count = 1
myplots <- list()
for(col in numcols){
options(repr.plot.width=8, repr.plot.height=8)
myplots[[count]] <- ggplot(dt, aes_string(col)) +
geom_density() +
ggtitle(paste(col)) +
theme_light()
count = count + 1
}
plot_grid(plotlist=myplots, ncol=3)
Most of the numeric variables are skewed to the left. Because the number of years are counted in integers, some plots look discrete. In these cases a bar graph may be more representative. Interestingly, there seem to be two peaks in YearsWithCurrManager. This may be useful for seperation of classes.
Next we will look at Attrition separation by numeric features.
Boxplots help visualize whether the quartiles overlap between label cases while showing statistical outliers for each feature. On the other hand, violin plots show a more detailed view of the distribution of the population.
mycolors =c("#00AFBB", "#E7B800")
names(mycolors) =c("Stayed", "Left")
dt_vio <- dt %>% select(Attrition, all_of(numcols))
count = 1
myplots <- list()
for(col in numcols){
myplots[[count]] <- ggplot(dt_vio, aes_string("Attrition", col)) +
geom_violin(aes(fill = Attrition), trim = FALSE) +
geom_boxplot(width = 0.15)+
scale_fill_manual(values = mycolors)+
ggtitle(paste(col)) +
theme_light() +
theme(legend.position = "none",
axis.title.y = element_blank())
count = count + 1
}
plot_grid(plotlist=myplots, ncol=3)
Insights
Let’s examine the categorical features of the Attrition groups, as well as the entire population. ##### 1. Bar plots of categorical features
# Visualize by categorical features
catcols <- dt %>% select(negate(is.numeric), -Attrition) %>% colnames
print(catcols)
## [1] "BusinessTravel" "Department"
## [3] "Education" "EducationField"
## [5] "Gender" "JobLevel"
## [7] "JobRole" "MaritalStatus"
## [9] "StockOptionLevel" "EnvironmentSatisfaction"
## [11] "JobSatisfaction" "WorkLifeBalance"
## [13] "JobInvolvement" "PerformanceRating"
plot_bars = function(df, cols){
options(repr.plot.width=3, repr.plot.height=4) # Set the initial plot area dimensions
dt_stayed = df[df$Attrition == "Stayed",]
dt_left = df[df$Attrition == "Left",]
for(col in cols){
p1 = ggplot(dt_stayed, aes_string(col)) +
geom_bar(alpha=0.8, fill=mycolors["Stayed"]) +
geom_text(aes(label = scales::percent(..prop.., accuracy=0.1), group = 1),
stat= "count", vjust = -.5, size=2.5) +
ggtitle(paste('Barplot of', col, '\n for Employees who Stayed')) +
theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p2 = ggplot(dt_left, aes_string(col)) +
geom_bar(alpha=0.8, fill=mycolors["Left"]) +
geom_text(aes(label = scales::percent(..prop.., accuracy=0.1), group = 1),
stat= "count", vjust = -.5, size=2.5) +
ggtitle(paste('Barplot of', col, '\n for Employees who Left')) +
theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
grid.arrange(p1,p2, nrow = 1)
}
}
plot_bars(dt, catcols)
# Faceted(scaled) view of the same data
plot_bars2 = function(df, cols){
options(repr.plot.width=6, repr.plot.height=6) # Set the initial plot area dimensions
for(col in cols){
p = ggplot(df, aes_string(col)) +
geom_bar(aes(fill = Attrition)) +
ggtitle(paste('Barplot of', col,'by Attrition')) +
facet_grid(cols = vars(Attrition), margins = T) +
geom_text(aes(label = scales::percent(..prop.., accuracy=0.1), group = 1),
stat= "count", vjust = -.5, size=2.5) +
scale_fill_manual(values=c(mycolors["Stayed"], mycolors["Left"], "#999999")) +
theme_light() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1))
print(p)
}
}
plot_bars2(dt, catcols)
Insights
There are several features, both numeric and categorical, that appear to differentiate employees who leave from the employees who stay. It’s hard to tell by looking at the visualization how and to what extent these features affect the probability of attrition. Therefore a machine learning algorithm that can take into consideration all the different factors would be beneficial for further understand what leads employees leaving and how to minimize attrition.
Furthermore, this exploratory data analysis was performed in Tableau.